Merged [18576] and [18577]
[adiumx.git] / Other / Adium Spotlight Importer / GetMetadataForHTMLLog-Additions.m
blob3a048da88c999ec9d444792aee594b855f0e0e6c
1 //
2 //  GetMetadataForHTMLLog-Additions.m
3 //  AdiumSpotlightImporter
4 //
5 //  Created by Evan Schoenberg on 5/25/06.
6 //
8 #import "GetMetadataForHTMLLog-Additions.h"
10 //From LMX. Included under the BSD license. http://trac.adiumx.com/wiki/LMXParser
11 static BOOL getSurrogatesForUnicodeScalarValue(const UTF32Char scalar, unichar *outHigh, unichar *outLow);
/*!
 * @brief These additions are all from AIUtilities
 *
 * The spotlight importer should include this file to get these specific additions.
 * If the GetMetadataForHTMLLog class is used in a situation in which AIUtilities is linked in already, it is
 * not necessary to include this implementation file.
 */
@implementation NSScanner (AdiumSpotlightImporterAdditions)

/*!
 * @brief Scan an unsigned decimal integer at the current scan location
 *
 * Characters in the scanner's charactersToBeSkipped set are consumed first. An optional leading '+' is
 * accepted; a '-' is not. At least one ASCII digit must follow.
 *
 * @param unsignedIntValue On success, receives the scanned value. May be NULL to just skip over the number.
 *        A value too large for an unsigned int is reported as the largest unsigned int.
 * @result YES if a number was scanned (the scan location is then just past its last digit); NO otherwise
 *         (the scan location is then just past any skipped characters).
 */
- (BOOL)scanUnsignedInt:(unsigned int *)unsignedIntValue
{
	//Skip characters if necessary. The skip set is temporarily removed so that it is consumed exactly once, here.
	NSCharacterSet *skipSet = [self charactersToBeSkipped];
	if (skipSet) {
		[self setCharactersToBeSkipped:nil];
		[self scanCharactersFromSet:skipSet intoString:NULL];
		[self setCharactersToBeSkipped:skipSet];
	}

	/* Read straight out of the scanned string using absolute indexes. This avoids copying the whole remainder
	 * of the string into a temporary buffer on every call (which was also never freed), and means the index
	 * handed to -setScanLocation: below is a location in the string rather than an offset from where we began.
	 */
	NSString *string = [self string];
	unsigned length = [string length];
	unsigned location = [self scanLocation];

	if ((location < length) && ([string characterAtIndex:location] == '+')) {
		++location;
	}

	//There must be at least one digit, or nothing was scanned.
	if (location >= length) return NO;
	unichar character = [string characterAtIndex:location];
	if ((character < '0') || (character > '9')) return NO;

	const unsigned maxValue = ~0U;
	unsigned total = 0;
	BOOL overflowed = NO;

	while (location < length) {
		character = [string characterAtIndex:location];
		if ((character < '0') || (character > '9')) break;

		unsigned digit = character - '0';
		if (overflowed || (total > ((maxValue - digit) / 10))) {
			//Keep consuming digits, but saturate rather than silently wrapping around.
			overflowed = YES;
		} else {
			total = (total * 10) + digit;
		}
		++location;
	}

	[self setScanLocation:location];
	if (unsignedIntValue) *unsignedIntValue = (overflowed ? maxValue : total);

	return YES;
}

@end
63 //From AIUtilities
64 @implementation NSString (AdiumSpotlightImporterAdditions)
/*!
 * @brief Split a Unicode scalar value into UTF-16 code units
 *
 * @param in The UTF-32 value to convert.
 * @param outHigh If non-NULL, receives the high (leading) surrogate, or 0 if no surrogate pair is needed.
 * @param outLow If non-NULL, receives the low (trailing) surrogate, or the value itself if no pair is needed.
 * @result YES if \a in lies beyond the Basic Multilingual Plane and so requires a surrogate pair; NO otherwise.
 */
BOOL AIGetSurrogates(UTF32Char in, UTF16Char *outHigh, UTF16Char *outLow)
{
	if (in < 0x10000) {
		if (outHigh) *outHigh = 0;
		if (outLow)  *outLow  = in;
		return NO;
	} else {
		enum {
			SupplementaryPlaneBase = 0x10000, //First scalar value that needs a surrogate pair
			HighSurrogateShift     = 10,      //The high surrogate carries the upper 10 of the 20 payload bits
			SurrogatePayloadMask   = 0x3FF,   //0b0000 0011 1111 1111
			HighSurrogateBase      = 0xD800,  //0b1101 1000 0000 0000
			LowSurrogateBase       = 0xDC00,  //0b1101 1100 0000 0000
		};

		/* UTF-16 encodes (in - 0x10000) as 20 bits: the upper 10 in the high surrogate and the lower 10 in the
		 * low surrogate. The subtraction has to happen before the shift; skipping it gives the wrong high
		 * surrogate for any scalar value above U+1FFFF and for most in the first supplementary plane.
		 */
		UTF32Char offset = in - SupplementaryPlaneBase;

		if (outHigh) {
			*outHigh = ((offset >> HighSurrogateShift) & SurrogatePayloadMask) | HighSurrogateBase;
		}

		if (outLow) {
			*outLow = (offset & SurrogatePayloadMask) | LowSurrogateBase;
		}

		return YES;
	}
}
/*!
 * @brief Read a string from a file, assuming it to be UTF8
 *
 * If it can not be read as UTF8, it will be read as ASCII.
 *
 * @param path Path of the file to read.
 * @result An autoreleased string with the file's contents, or nil if the file could not be read. A failure
 *         other than the file not existing is logged.
 */
+ (NSString *)stringWithContentsOfUTF8File:(NSString *)path
{
	NSString	*string = nil;

	if ((floor(kCFCoreFoundationVersionNumber) > kCFCoreFoundationVersionNumber10_3)) {
		NSError *error = nil;

		string = [NSString stringWithContentsOfFile:path
										   encoding:NSUTF8StringEncoding
											  error:&error];

		//Failure is signalled by the nil return value; the error is only meaningful once that has happened.
		if (!string) {
			BOOL	handled = NO;

			if ([[error domain] isEqualToString:NSCocoaErrorDomain]) {
				int		errorCode = [error code];

				if (errorCode == NSFileReadNoSuchFileError) {
					//File not found. Nothing to report; the caller gets nil.
					handled = YES;

				} else if (errorCode == NSFileReadInapplicableStringEncodingError) {
					/* Reason: File could not be opened using text encoding Unicode (UTF-8).
					 * Description: Text encoding Unicode (UTF-8) is not applicable.
					 *
					 * We couldn't read the file as UTF8. Try again as ASCII.
					 * NOTE(review): NSASCIIStringEncoding is documented as strict 7-bit, so this may also fail
					 * for the same bytes that defeated UTF8 - confirm whether a more lenient encoding is wanted.
					 */
					NSError		*newError = nil;

					string = [NSString stringWithContentsOfFile:path
													   encoding:NSASCIIStringEncoding
														  error:&newError];

					//If we got a string this time, we recovered reasonably successfully...
					if (string) {
						handled = YES;
					}
				}
			}

			if (!handled) {
				NSLog(@"Error reading %@:\n%@; %@.",path,
					  [error localizedDescription], [error localizedFailureReason]);
			}
		}

	} else {
		//COMPAT 10.3: the NSError-returning NSString API is not used here; go through NSData instead.
		NSData	*data = [NSData dataWithContentsOfFile:path];

		if (data) {
			string = [[[NSString alloc] initWithData:data
											encoding:NSUTF8StringEncoding] autorelease];
			if (!string) {
				string = [[[NSString alloc] initWithData:data
												encoding:NSASCIIStringEncoding] autorelease];
			}

			if (!string) {
				NSLog(@"Error reading %@",path);
			}
		} else {
			//File not found
			string = nil;
		}
	}

	return string;
}
174 //stringByUnescapingFromXMLWithEntities: was written by Peter Hosey and is explicitly released under the BSD license.
176  Copyright © 2006 Peter Hosey
177  All rights reserved.
179  Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
180  Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
181  Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
182  Neither the name of Peter Hosey nor the names of his contributors may be used to endorse or promote products derived from this software without specific prior written permission.
184  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
185  */
/*!
 * @brief Replace XML character and entity references in the receiver with the characters they stand for
 *
 * @param entities Maps entity names (without the '&' and ';') to replacement strings. On the 10.3 fallback
 *        path, nil selects amp, lt, gt, quot, apos and nbsp.
 * @result An autoreleased string with references expanded. On the 10.3 fallback path, anything that looks like
 *         a reference but is malformed, out of range or unknown is left in the output exactly as written.
 */
- (NSString *)stringByUnescapingFromXMLWithEntities:(NSDictionary *)entities
{
	if (floor(NSAppKitVersionNumber) > NSAppKitVersionNumber10_3) {
		return [(NSString *)CFXMLCreateStringByUnescapingEntities(kCFAllocatorDefault, (CFStringRef)self, (CFDictionaryRef)entities) autorelease];
	} else {
		//COMPAT 10.3

		if (!entities) {
			static const unichar nbsp = 0xa0;
			entities = [NSDictionary dictionaryWithObjectsAndKeys:
				@"&",  @"amp",
				@"<",  @"lt",
				@">",  @"gt",
				@"\"", @"quot",
				@"'",  @"apos",
				[NSString stringWithCharacters:&nbsp length:1], @"nbsp",
				nil];
		}

		unsigned len = [self length];
		NSMutableString *result = [NSMutableString stringWithCapacity:len];
		NSScanner *scanner = [NSScanner scannerWithString:self];
		//Nothing may be skipped: whitespace is content here.
		[scanner setCharactersToBeSkipped:[NSCharacterSet characterSetWithRange:(NSRange){ 0, 0 }]];

		while (YES) { //Actual condition is below.
			//Copy everything up to the next ampersand unchanged.
			NSString *chunk = nil;
			if ([scanner scanUpToString:@"&" intoString:&chunk]) {
				[result appendString:chunk];
			}

			//Remember where the reference starts so that it can be reproduced verbatim if it turns out to be bad.
			unsigned ampersandLocation = [scanner scanLocation];
			BOOL foundAmpersand = [scanner scanString:@"&" intoString:NULL];

			//Condition is here.
			if ([scanner scanLocation] >= len) {
				//An ampersand that is the very last character is ordinary text, not the start of a reference.
				if (foundAmpersand) [result appendString:@"&"];
				break;
			}

			if ([scanner scanString:@"#" intoString:NULL]) {
				//Numeric character reference: &#xHHHH; or &#DDDD;
				UTF32Char scalar = 0xffff;
				BOOL parsed;

				if ([scanner scanString:@"x" intoString:NULL] || [scanner scanString:@"X" intoString:NULL]) {
					//Probably hex.
					unsigned hexValue = 0xffff;
					parsed = [scanner scanHexInt:&hexValue];
					scalar = hexValue;
				} else {
					//Not hex. Hopefully decimal.
					int decimalValue = 65535; //== 0xffff
					parsed = [scanner scanInt:&decimalValue];
					//A negative value becomes a huge unsigned one, which the range check below rejects.
					scalar = (UTF32Char)decimalValue;
				}

				if (parsed && (scalar <= 0x10FFFF) && [scanner scanString:@";" intoString:NULL]) {
					unichar high, low;
					if (getSurrogatesForUnicodeScalarValue(scalar, &high, &low)) {
						[result appendFormat:@"%C%C", high, low];
					} else {
						[result appendFormat:@"%C", low];
					}
				} else {
					/* No digits, no terminating semicolon, or not a Unicode code point. Put back exactly what was
					 * consumed, rather than re-printing the parsed number (which loses its radix and leading zeroes).
					 */
					NSRange consumed = NSMakeRange(ampersandLocation, [scanner scanLocation] - ampersandLocation);
					[result appendString:[self substringWithRange:consumed]];
				}
			} else {
				//Not a numeric entity. Should be a named entity.
				NSString *entityName = nil;
				if (![scanner scanUpToString:@";" intoString:&entityName]) {
					[result appendString:@"&"];
				} else {
					NSString *entity = [entities objectForKey:entityName];

					//Strip the semicolon.
					[scanner scanString:@";" intoString:NULL];

					if (entity) {
						[result appendString:entity];

					} else {
						NSLog(@"-[NSString(AIStringAdditions) stringByUnescapingFromXMLWithEntities]: Named entity %@ unknown.", entityName);

						//Keep the unrecognised text instead of dropping it; it may just be a bare ampersand in prose.
						NSRange consumed = NSMakeRange(ampersandLocation, [scanner scanLocation] - ampersandLocation);
						[result appendString:[self substringWithRange:consumed]];
					}
				}
			}
		}

		return [NSString stringWithString:result];
	}
}
278 @end
/*!
 * @brief Split a Unicode scalar value into UTF-16 code units
 *
 * @param scalar The UTF-32 value to convert.
 * @param outHigh If non-NULL, receives the high surrogate, or 0x0000 if no surrogate pair is needed.
 * @param outLow If non-NULL, receives the low surrogate, or the value itself if no surrogate pair is needed.
 * @result YES if \a scalar is above 0xffff and was therefore split into a surrogate pair; NO otherwise.
 */
static BOOL getSurrogatesForUnicodeScalarValue(const UTF32Char scalar, unichar *outHigh, unichar *outLow) {
	if(scalar <= 0xffff) {
		if(outHigh)
			*outHigh = 0x0000;
		if(outLow)
			*outLow  = scalar;
		return NO;
	}

	/* note: names uuuuu, wwww, and xxxxx+ are taken from the Unicode book (section 3.9, table 3-4).
	 *
	 *	scalar value:   000uuuuu xxxxxxxx xxxxxxxx
	 *	high surrogate: 110110ww wwxxxxxx   (wwww = uuuuu - 1)
	 *	low surrogate:  110111xx xxxxxxxx
	 *
	 * The fields are extracted with shifts and masks. How a compiler lays out bit-fields, and which bytes of a
	 * union a narrower member overlaps, both vary with the target, so reading the code units back out of
	 * bit-field unions only gives the right answer on some architectures.
	 */
	enum {
		surrogatePayloadMask = 0x3ff,  //0b0000 0011 1111 1111
		highSurrogatePrefix  = 0xd800, //0b1101 1000 0000 0000
		lowSurrogatePrefix   = 0xdc00, //0b1101 1100 0000 0000
	};

	//Subtracting 0x10000 is what turns uuuuu into wwww (uuuuu - 1); it leaves the low 16 bits untouched.
	const UTF32Char offset = scalar - 0x10000;

	if(outHigh) {
		//wwww followed by the upper six x bits.
		*outHigh = (unichar)(highSurrogatePrefix | ((offset >> 10) & surrogatePayloadMask));
	}

	if(outLow) {
		//The lower ten x bits.
		*outLow = (unichar)(lowSurrogatePrefix | (scalar & surrogatePayloadMask));
	}

	return YES;
}